from datetime import datetime, timedelta, date
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
pyoff.init_notebook_mode()
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import missingno as msno
# Load the raw datasets.
# NOTE(review): relative paths — assumes the notebook runs from the project
# root with a data/ folder present.
customer = pd.read_csv('data/customer.csv')
newcustomer = pd.read_csv('data/newCustomer.csv')
transaction = pd.read_csv('data/transaction.csv')

# Parse transaction dates once, then derive the month period from the
# already-parsed datetime column (the original re-parsed the raw strings a
# second time for no benefit).
transaction['transaction_date'] = pd.to_datetime(transaction['transaction_date'])
transaction['month_year'] = transaction['transaction_date'].dt.to_period('M')

# Monthly gross margin plus month-over-month growth rate.
margin_month = transaction.groupby('month_year')['margin'].sum().reset_index()
margin_month['month_year'] = margin_month['month_year'].astype(str)
margin_month['monthlygrowth'] = margin_month['margin'].pct_change()
# Line chart of total margin per month.
plot_data = [
    go.Scatter(
        x=margin_month['month_year'],
        y=margin_month['margin'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Revenue'  # fixed typo: was 'Montly Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Line chart of the month-over-month margin growth rate.
plot_data = [
    go.Scatter(
        x=margin_month['month_year'],
        y=margin_month['monthlygrowth'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Growth Rate'  # fixed typo: was 'Montly Growth Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Bar chart: number of distinct purchasing customers per month.
customer_month = (
    transaction.groupby('month_year')['customer_id']
    .nunique()
    .reset_index()
)
customer_month['month_year'] = customer_month['month_year'].astype(str)
active_customers_bar = go.Bar(
    x=customer_month['month_year'],
    y=customer_month['customer_id'],
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Number of Active Customers '
)
plot_data = [active_customers_bar]
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Bar chart: transaction volume per month.
transaction_month = (
    transaction.groupby('month_year')
    .size()
    .to_frame('size')
    .reset_index()
)
transaction_month['month_year'] = transaction_month['month_year'].astype(str)
volume_bar = go.Bar(
    x=transaction_month['month_year'],
    y=transaction_month['size'],
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Number of Transaction'
)
plot_data = [volume_bar]
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
As our methodology, we calculate Recency, Frequency and Monetary Value (margin) for each customer, then apply unsupervised machine learning to identify distinct groups (clusters) for each metric.
Recency: To calculate recency, we find the most recent purchase date of each customer and see how many days they have been inactive for. After computing the number of inactive days for each customer, we apply K-means clustering to assign each customer a recency score.
Frequency: To create the frequency clusters, we find the total number of orders for each customer. First we calculate this and see what the frequency distribution looks like across our customer database.
Monetary Value: Let's see what our customer database looks like when we cluster customers based on revenue. We will calculate revenue for each customer, plot a histogram, and apply the same clustering method.
# Peek at the transaction table before aggregating.
transaction.head()

def recency_days(x):
    """Days between a customer's most recent purchase and the latest
    purchase date in the whole dataset (smaller = more recently active)."""
    latest_overall = transaction['transaction_date'].max()
    return (latest_overall - x.max()).days

# One row per customer: Recency / Frequency / Monetary value / total margin.
agg_spec = {
    'transaction_date': recency_days,
    'transaction_id': 'count',
    'list_price': 'sum',
    'margin': 'sum',
}
rfm_names = {
    'transaction_date': 'recency',
    'transaction_id': 'frequency',
    'list_price': 'monetary_value',
    'margin': 'ltv',
}
customer_rfm = (
    transaction
    .groupby('customer_id')
    .agg(agg_spec)
    .rename(columns=rfm_names)
    .reset_index()
)
# Histogram of a single RFM column.
def plot_rfm(col, title):
    """Render an offline plotly histogram of customer_rfm[col] titled `title`."""
    histogram = go.Histogram(x=customer_rfm[col])
    layout = go.Layout(title=title)
    fig = go.Figure(data=[histogram], layout=layout)
    pyoff.iplot(fig)
# KMeans elbow plot helper.
def elbow_method(col):
    """Plot within-cluster SSE (inertia) for k = 1..9 on customer_rfm[col],
    to help choose the number of clusters."""
    inertia_by_k = {}
    for n_clusters in range(1, 10):
        model = KMeans(n_clusters=n_clusters, max_iter=1000)
        model.fit(customer_rfm[[col]])
        inertia_by_k[n_clusters] = model.inertia_
    plt.figure()
    plt.plot(list(inertia_by_k.keys()), list(inertia_by_k.values()))
    plt.xlabel('Number of Cluster')
    plt.show()
# Cluster one RFM column and rank the clusters by their mean.
def cluster_df(n_clusters, col, cluster_name, ascending, random_state=None):
    """KMeans-cluster customer_rfm[col] and return the clusters ranked by mean.

    Side effect: writes the raw KMeans labels into customer_rfm[cluster_name].

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters.
    col : str
        Column of customer_rfm to cluster on.
    cluster_name : str
        Name of the label column added to customer_rfm.
    ascending : bool
        Sort order of the per-cluster means; pick it so a larger
        '<cluster_name>_reorder' index means a better customer
        (descending for recency, ascending for frequency / monetary value).
    random_state : int or None, optional
        Seed passed to KMeans for reproducible labels. Default None keeps
        the original (non-deterministic) behaviour.

    Returns
    -------
    DataFrame
        describe() stats per cluster plus an ordered
        '<cluster_name>_reorder' rank column.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(customer_rfm[[col]])
    customer_rfm[cluster_name] = kmeans.predict(customer_rfm[[col]])
    # Rank clusters by their mean so arbitrary cluster ids become an
    # ordered 0..n_clusters-1 score.
    reorder_temp_df = customer_rfm\
        .groupby(cluster_name)[col]\
        .describe()\
        .sort_values(by='mean', ascending=ascending)\
        .reset_index()
    reorder_temp_df.index.name = cluster_name + '_reorder'
    reorder_temp_df.reset_index(inplace=True)
    return reorder_temp_df
# --- Recency: days since last purchase (lower is better) -------------------
plot_rfm('recency', 'Recency')
elbow_method('recency')
recency_cluster_reorder = cluster_df(
    n_clusters=4, col='recency',
    cluster_name='recency_cluster', ascending=False)
recency_cluster_reorder

# --- Frequency: total order count per customer -----------------------------
plot_rfm('frequency', 'Frequency')
elbow_method('frequency')
frequency_cluster_reorder = cluster_df(
    n_clusters=4, col='frequency',
    cluster_name='frequency_cluster', ascending=True)
frequency_cluster_reorder

# --- Monetary value: total list price per customer -------------------------
plot_rfm('monetary_value', 'Monetary Value')
elbow_method('monetary_value')
monetary_value_cluster_reorder = cluster_df(
    n_clusters=4, col='monetary_value',
    cluster_name='monetary_value_cluster', ascending=True)
monetary_value_cluster_reorder

# --- LTV: total margin per customer ----------------------------------------
plot_rfm('ltv', 'Lifetime Value')
elbow_method('ltv')
ltv_cluster_reorder = cluster_df(
    n_clusters=3, col='ltv',
    cluster_name='ltv_cluster', ascending=True)
ltv_cluster_reorder
# Attach the ordered cluster ranks to every customer, build an overall RFM
# score, and map it to a Low/Mid/High-Value segment label.
all_clusters = customer_rfm\
    .merge(monetary_value_cluster_reorder[['monetary_value_cluster_reorder', 'monetary_value_cluster']], on='monetary_value_cluster')\
    .merge(frequency_cluster_reorder[['frequency_cluster_reorder', 'frequency_cluster']], on='frequency_cluster')\
    .merge(recency_cluster_reorder[['recency_cluster_reorder', 'recency_cluster']], on='recency_cluster')\
    .merge(ltv_cluster_reorder[['ltv_cluster_reorder', 'ltv_cluster']], on='ltv_cluster')
all_clusters = all_clusters[['customer_id', 'recency', 'frequency', 'monetary_value', 'ltv',
                             'monetary_value_cluster_reorder', 'frequency_cluster_reorder',
                             'recency_cluster_reorder', 'ltv_cluster_reorder']]
# Overall score = sum of the three ordered RFM cluster ranks.
all_clusters['overall'] = (all_clusters['recency_cluster_reorder']
                           + all_clusters['frequency_cluster_reorder']
                           + all_clusters['monetary_value_cluster_reorder'])
all_clusters.head()
# BUG FIX: multi-column selection after groupby requires a list; the
# original tuple form ( ['recency', 'frequency', ...] without brackets )
# is deprecated and raises in pandas >= 2.0.
all_clusters.groupby('overall')[['recency', 'frequency', 'monetary_value', 'ltv']].mean()
all_clusters['segment'] = 'Low-Value'
all_clusters.loc[all_clusters['overall'] > 2, 'segment'] = 'Mid-Value'
all_clusters.loc[all_clusters['overall'] > 7, 'segment'] = 'High-Value'
# Visualise the segments across pairs of RFM dimensions (4 stacked panels).
plt.figure(figsize=(15, 30))
panel_axes = [
    ('recency', 'frequency'),
    ('recency', 'monetary_value'),
    ('frequency', 'monetary_value'),
    ('overall', 'ltv'),
]
for panel, (x_col, y_col) in enumerate(panel_axes, start=1):
    plt.subplot(4, 1, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='segment', data=all_clusters)
sns.despine()
LTV prediction with XGBoost multi-classification.
Lifetime Value = Total Gross Revenue − Total Cost.
# Join customer attributes with each customer's LTV and segment label,
# then one-hot encode the categorical columns for modelling.
feature_cols = ['customer_id', 'segment', 'gender',
                'past_3_years_bike_related_purchases',
                'wealth_segment', 'owns_car', 'tenure', 'state', 'age_group',
                'property_valuation', 'age', 'ltv']
features = customer.merge(
    all_clusters[['customer_id', 'ltv', 'segment']],
    on='customer_id')[feature_cols]
features_dummy = pd.get_dummies(
    features,
    columns=['wealth_segment', 'state', 'owns_car', 'gender'],
    drop_first=True)
def feature_selection(col, target='ltv', df=None):
    """Split a feature frame into model inputs X and target y.

    Rows containing any NaN within `col` are dropped before splitting.

    Parameters
    ----------
    col : list[str]
        Columns to keep; must include `target`.
    target : str, optional
        Name of the target column popped into y (default 'ltv',
        preserving the original behaviour).
    df : DataFrame, optional
        Source frame; defaults to the module-level `features_dummy`.

    Returns
    -------
    (X, y)
        Feature DataFrame without the target column, and the target Series.
    """
    if df is None:
        df = features_dummy
    selected = df[col].dropna()
    y = selected.pop(target)
    return selected, y
# Persist the engineered features, then build the train/test split on the
# numeric columns (target: ltv).
features.to_csv('data/features.csv', index=False)
features_dummy.columns
col = ['past_3_years_bike_related_purchases', 'tenure', 'property_valuation',
       'age', 'ltv']
X, y = feature_selection(col)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
# Mean LTV per age group as a bar chart (the last age_group bucket is
# dropped by the [:-1] slice).
features['age_group'] = features['age_group'].astype(str)
features.head()
df = features.groupby('age_group')['ltv'].mean().reset_index()[: -1]
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 0.6})
plt.figure(figsize=(15, 15))
ax = sns.barplot(data=df,
                 x="age_group",
                 y='ltv', color='#5D6D7E')
# Highlight the last two bars in contrasting colours.
ax.patches[-2].set_color('#E9967A')
ax.patches[-1].set_color('#95A5A6')
# BUG FIX: original read `ax,set_ticks([])` — a comma where a dot was meant,
# which raised NameError; Axes also has no `set_ticks` method. Hiding the
# x tick marks appears to be the intent.
# NOTE(review): confirm whether x or y ticks were meant to be hidden.
ax.set_xticks([])
ax.set_title('customer lifetime value by age group')
ax.set_ylabel('Customer Lifetime Value')
ax.set_xlabel('Age Group')
sns.despine()
# Mean LTV broken down by several customer attributes, one bar chart each.

# By wealth segment.
df = features.groupby('wealth_segment')['ltv'].mean().reset_index()
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 0.6})
plt.figure(figsize=(15, 15))
ax = sns.barplot(data=df, x="wealth_segment", y='ltv', color='#5D6D7E')
sns.despine()

# By tenure, highest mean LTV first.
df = (features.groupby('tenure')['ltv'].mean()
      .reset_index()
      .sort_values('ltv', ascending=False))
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 0.6})
plt.figure(figsize=(15, 15))
ax = sns.barplot(data=df, x="tenure", y='ltv', color='#5D6D7E')
sns.despine()

# By property valuation score.
df = features.groupby('property_valuation')['ltv'].mean().reset_index()
df.head()
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 0.6})
plt.figure(figsize=(15, 15))
ax = sns.barplot(data=df, x="property_valuation", y='ltv', color='#5D6D7E')
sns.despine()

# By state.
df = features.groupby('state')['ltv'].mean().reset_index()
df.head()
sns.set_context("poster", font_scale=.5, rc={"grid.linewidth": 0.6})
plt.figure(figsize=(15, 15))
ax = sns.barplot(data=df, x="state", y='ltv', color='#5D6D7E')
sns.despine()
import matplotlib.pyplot as plt  # re-import kept from the original; harmless
# Donut chart: outer ring = CLV tier sizes, inner ring = gender split
# within each tier.
tier_labels = ['Low CLV', 'Mid CLV', 'High CLV']
tier_sizes = [1493, 1432, 564]
gender_labels = ['Female', 'Male', 'Female', 'Male', 'Female', 'Male']
gender_sizes = [769, 698, 717, 680, 272, 276]
tier_colors = ['#99ff99', '#ffcc99', '#ff6666']
gender_colors = ['#ffb3e6', '#c2c2f0', '#ffb3e6', '#c2c2f0', '#ffb3e6', '#c2c2f0']
tier_explode = (0.2, 0.2, 0.2)
gender_explode = (0.1, 0.1, 0.1, 0.1, 0.1, 0.1)
# Outer then inner ring.
plt.pie(tier_sizes, labels=tier_labels, colors=tier_colors, startangle=90,
        frame=True, explode=tier_explode, radius=3)
plt.pie(gender_sizes, colors=gender_colors, startangle=90,
        explode=gender_explode, radius=2)
# Punch the hole in the middle to make it a donut.
centre_circle = plt.Circle((0, 0), 1.5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.axis('equal')
plt.tight_layout()
plt.show()
# Second donut: outer ring = gender totals, inner ring = CLV tiers within
# each gender.
outer_labels = ['Female', 'Male']
outer_sizes = [1758, 1654]
inner_labels = ['Low CLT', 'Mid CLT', 'High CLT', 'Low CLT', 'Mid CLT', 'High CLT']
inner_sizes = [769, 717, 272, 698, 680, 276]
inner_colors = ['#99ff99', '#ffcc99', '#ff6666', '#99ff99', '#ffcc99', '#ff6666']
outer_colors = ['#ffb3e6', '#c2c2f0']
inner_explode = (0.1, 0.1, 0.1, 0.1, 0.1, 0.1)
outer_explode = (0.2, 0.2)
# Outer then inner ring.
plt.pie(outer_sizes, labels=outer_labels, colors=outer_colors, startangle=90,
        frame=True, explode=outer_explode, radius=3)
plt.pie(inner_sizes, colors=inner_colors, startangle=90,
        explode=inner_explode, radius=2)
# Hole in the middle for the donut effect.
centre_circle = plt.Circle((0, 0), 1.5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.axis('equal')
plt.tight_layout()
plt.show()
# Gender counts and LTV-tier breakdown per gender, plus tenure distribution
# coloured by tier.
features.gender.value_counts()
# BUG FIX: `features` has no 'ltv_cluster_reorder' column (see its explicit
# column selection where it is built), so the original raised a KeyError.
# 'segment' carries the Low/Mid/High-Value tier.
# NOTE(review): confirm 'segment' is the intended grouping column.
features.groupby(['gender', 'segment']).size()
plt.figure(figsize=(15, 15))
sns.histplot(data=features,
             x="tenure",
             kde=True,
             hue='segment',
             bins=5)
sns.despine()